On cherche à étudier l’effet de trois facteurs sur le transcriptome des racines d’Arabidopsis thaliana. Le CO2, au cours des études préliminaires, s’est montré peu influent en conditions contrôles de fer et de nitrates, et accentué en cas de stress nutritionnel. Nous reprennons ces résultats avec des fonctions génériques et propres pour en faire le résumé et de jolis graphes.
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## number of items read is not a multiple of the number of columns
On a, pour chaque gène et chaque condition, son niveau d’expression en sortie de quantification. On labelle les conditions avec le code suivant : lettre majuscule pour le niveau fort, minuscule pour le niveau faible. Le réplicat est donné après l’underscore.
# quantification file
data <- read.csv("quantifFiles/QuantifGenes.csv", h = T, sep = ",")
rownames(data) <- data$Gene
genes = which(!(grepl("__", rownames(data))))
not_quant = data[which((grepl("__", rownames(data)))), ]
data = data[genes, grepl("R", colnames(data))]
getLabel("R6")[1] "cnF_3"
[1] At_AmbientCO2_LowNitrate_Fe1
48 Levels: At_AmbientCO2_HighNitrate_Fe1 ... Sl_ElevatedCO2_LowNitrate_FeStarvation3
[1] "At_AmbientCO2_LowNitrate_Fe"
keep <- rowSums(data) >= 10
data <- data[keep, ]
group <- sapply(colnames(data), getLabel, with.rep = F)
colnames(data) <- sapply(colnames(data), getLabel)
head(data) cNF_3 cNF_2 cNF_1 cnF_2 cnF_1 cnF_3 CNF_1 CnF_2 CnF_1 CnF_3 cNf_1
AT1G01010 1526 1006 1116 1275 967 1018 854 1132 1294 1364 2325
AT1G01020 416 285 289 349 364 370 226 513 502 561 461
AT1G01030 31 15 19 29 36 28 12 47 34 47 18
cnf_2 cnf_1 cNf_2 cNf_3 cnf_3 Cnf_3 CNf_1 Cnf_1 Cnf_2 CNf_3 CNf_2
AT1G01010 2113 2193 2564 2364 2074 1987 2027 1754 1697 1537 1898
AT1G01020 407 432 614 380 502 484 467 426 415 413 462
AT1G01030 40 32 44 37 27 42 39 36 40 37 37
CNF_3 CNF_2
AT1G01010 816 912
AT1G01020 223 312
AT1G01030 15 19
[ reached 'max' / getOption("max.print") -- omitted 3 rows ]
[1] 23342 24
On définit les conditions contrôle comme suit : fort nitrate et fort fer.
method = "edger"
g = list()
# reference condition as first element of labels
labels <- c("cNF", "CNF")
genes1 <- dualDE(data, labels, pval = 0.01, method = method) (Intercept) groupCNF
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cNF_3" "cNF_2" "cNF_1" "CNF_1" "CNF_3" "CNF_2"
cNF_3 cNF_2 cNF_1 CNF_1 CNF_3 CNF_2
1.0090325 1.0047763 0.9805842 1.0116521 0.9699686 1.0239863
[1] "68 genes DE"
[1] 17
(Intercept) groupCnF
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cnF_2" "cnF_1" "cnF_3" "CnF_2" "CnF_1" "CnF_3"
cnF_2 cnF_1 cnF_3 CnF_2 CnF_1 CnF_3
0.9862253 0.9873030 0.9924054 1.0167225 1.0047221 1.0126218
[1] "1312 genes DE"
(Intercept) groupCNf
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cNf_1" "cNf_2" "cNf_3" "CNf_1" "CNf_3" "CNf_2"
cNf_1 cNf_2 cNf_3 CNf_1 CNf_3 CNf_2
0.9799002 1.0044883 0.9723876 0.9981139 1.0241404 1.0209697
[1] "2219 genes DE"
(Intercept) groupCnf
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cnf_2" "cnf_1" "cnf_3" "Cnf_3" "Cnf_1" "Cnf_2"
cnf_2 cnf_1 cnf_3 Cnf_3 Cnf_1 Cnf_2
0.9975633 1.0046643 1.0015093 0.9727479 1.0064673 1.0170479
[1] "1656 genes DE"
On visualise les gènes différentiellement exprimés en commun entre les différents niveaux des autres facteurs.
library(ggVennDiagram)
library(VennDiagram)
gene_list <- list()
for (comp in names(g)) {
print(g[[comp]])
gene_list[[comp]] <- g[[comp]]$gene_id
} gene_id a.value m.value p.value q.value rank estimatedDEG
1 AT5G64120 8.797218 -1.1121447 1.895561e-15 4.424619e-11 1 1
2 AT1G19250 9.948535 -0.8706103 1.978672e-14 1.946612e-10 2 1
3 AT4G25100 9.324940 -0.9038503 2.501858e-14 1.946612e-10 3 1
4 AT5G59520 8.819711 -1.2066060 1.693286e-12 9.037014e-09 4 1
5 AT2G01520 14.038784 -0.9716064 1.935784e-12 9.037014e-09 5 1
6 AT1G67270 5.104540 -2.2140173 3.326926e-12 1.294285e-08 6 1
7 AT5G23980 8.719121 -1.1156310 2.014845e-10 6.718644e-07 7 1
8 AT4G37060 10.229806 -0.8191550 6.994317e-10 2.040767e-06 8 1
9 AT5G23990 4.823373 -2.0335514 9.553935e-10 2.477866e-06 9 1
upreg
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
[ reached 'max' / getOption("max.print") -- omitted 59 rows ]
gene_id a.value m.value p.value q.value rank estimatedDEG
1 AT5G61600 10.978022 1.715345 5.768501e-56 1.346484e-51 1 1
2 AT4G27450 10.987889 1.361106 1.241560e-51 1.449024e-47 2 1
3 AT1G27730 9.421760 2.187544 2.462590e-51 1.916059e-47 3 1
4 AT5G51190 8.417337 2.782680 4.320997e-49 2.521518e-45 4 1
5 AT2G27830 9.879502 1.316552 1.397834e-39 6.525650e-36 5 1
6 AT1G19530 12.542251 1.538979 1.842265e-38 7.167027e-35 6 1
7 AT4G27280 10.488397 1.691216 4.608884e-38 1.536865e-34 7 1
8 AT5G13930 10.407278 -1.747350 6.719526e-38 1.960590e-34 8 1
9 AT3G44260 8.737846 2.483672 1.820216e-36 4.720832e-33 9 1
upreg
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 0
9 1
[ reached 'max' / getOption("max.print") -- omitted 1303 rows ]
gene_id a.value m.value p.value q.value rank estimatedDEG
1 AT1G78340 10.002535 -3.261719 5.399049e-136 1.260246e-131 1 1
2 AT4G13180 10.301728 -2.811270 3.011796e-121 3.515067e-117 2 1
3 AT4G15550 8.217350 -3.781600 5.522983e-117 4.297249e-113 3 1
4 AT4G34131 8.946763 -2.699004 1.584031e-95 9.243612e-92 4 1
5 AT1G76680 9.778808 -3.051667 2.789158e-90 1.302091e-86 5 1
6 AT3G28345 9.654586 -2.538451 3.170729e-89 1.233519e-85 6 1
7 AT3G45060 9.161753 2.510301 5.415418e-82 1.805810e-78 7 1
8 AT1G76690 9.762616 -2.235867 5.461401e-76 1.593500e-72 8 1
9 AT3G28740 4.011621 -7.093650 9.075419e-76 2.353760e-72 9 1
upreg
1 0
2 0
3 0
4 0
5 0
6 0
7 1
8 0
9 0
[ reached 'max' / getOption("max.print") -- omitted 2210 rows ]
gene_id a.value m.value p.value q.value rank estimatedDEG
1 AT3G49620 8.971142 3.325629 4.576332e-77 1.068207e-72 1 1
2 AT2G26400 12.032573 -2.107176 1.946526e-64 2.271791e-60 2 1
3 AT1G21100 9.986600 1.996729 3.562033e-43 2.771499e-39 3 1
4 AT1G69530 9.527594 -1.507353 2.066807e-40 1.206085e-36 4 1
5 AT5G51860 6.042441 3.573607 4.906245e-40 2.290431e-36 5 1
6 AT5G51870 8.032896 1.826646 6.948595e-39 2.703235e-35 6 1
7 AT1G08630 8.692334 2.049414 1.918515e-37 6.397426e-34 7 1
8 AT1G56600 8.952644 -1.850385 1.108793e-36 3.235181e-33 8 1
9 AT1G43800 11.887403 -2.934754 2.086789e-36 5.412203e-33 9 1
upreg
1 1
2 0
3 1
4 0
5 1
6 1
7 1
8 0
9 0
[ reached 'max' / getOption("max.print") -- omitted 1647 rows ]
partitions <- get.venn.partitions(gene_list, keep.elements = T)
partitions$shared <- rowSums(partitions[1:4])
partitions <- partitions[order(-partitions$shared), ]
sharedBy3 <- unique(unlist(subset(partitions, partitions$shared == 3)$..values..))
a <- OntologyProfile(sharedBy3, specie = specie) ensembl_gene_id
1 AT1G02640
2 AT1G11185
3 AT1G11260
4 AT1G11920
5 AT1G12040
6 AT1G12805
7 AT1G17180
8 AT1G18400
9 AT1G19250
10 AT1G27950
11 AT1G29100
12 AT1G33055
13 AT1G43800
14 AT1G48930
15 AT1G54950
16 AT1G54970
17 AT1G55790
18 AT1G56580
description
1 Probable beta-D-xylosidase 2 [Source:UniProtKB/Swiss-Prot;Acc:Q94KD8]
2 other RNA [Source:TAIR;Acc:AT1G11185]
3 STP1 [Source:UniProtKB/TrEMBL;Acc:A0A178WJ63]
4 Putative pectate lyase 2 [Source:UniProtKB/Swiss-Prot;Acc:O65388]
5 Leucine-rich repeat extensin-like protein 1 [Source:UniProtKB/Swiss-Prot;Acc:O65375]
6 Nucleotide binding protein [Source:UniProtKB/TrEMBL;Acc:Q3EDD5]
7 Glutathione S-transferase U25 [Source:UniProtKB/Swiss-Prot;Acc:Q9SHH7]
8 Transcription factor BEE 1 [Source:UniProtKB/Swiss-Prot;Acc:Q8GZ13]
9 Probable flavin-containing monooxygenase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q9LMA1]
10 Non-specific lipid transfer protein GPI-anchored 1 [Source:UniProtKB/Swiss-Prot;Acc:Q9C7F7]
11 Heavy metal transport/detoxification superfamily protein [Source:TAIR;Acc:AT1G29100]
12 Uncharacterized protein unannotated coding sequence from BAC F9L11 [Source:UniProtKB/TrEMBL;Acc:Q94K21]
13 Stearoyl-[acyl-carrier-protein] 9-desaturase 6, chloroplastic [Source:UniProtKB/Swiss-Prot;Acc:Q84VY3]
14 Endoglucanase 5 [Source:UniProtKB/Swiss-Prot;Acc:Q9M995]
15 unknown protein; FUNCTIONS IN: molecular_function unknown; INVOLVED IN: biological_process unknown; LOCATED IN: endomembrane system; BEST Arabidopsis thaliana protein match is: unknown protein (TAIR:AT1G07690.1); Ha. [Source:TAIR;Acc:AT1G54950]
16 Proline-rich protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q9FZ35]
17 Domain of unknown function (DUF2431) [Source:TAIR;Acc:AT1G55790]
18 At1g56580/F25P12_18 [Source:UniProtKB/TrEMBL;Acc:Q9FXB0]
external_gene_name entrezgene_id
1 BXL2 837940
2 NA
3 STP1 837667
4 837742
5 LRX1 837756
6 2745749
7 GSTU25 838289
8 BEE1 838421
9 FMO1 838508
10 LTPG1 839688
11 839785
12 840201
13 S-ACP-DES6 840977
14 AtGH9C1 841315
15 841935
16 PRP1 841938
17 NA
18 SVB 842112
[ reached 'max' / getOption("max.print") -- omitted 82 rows ]
For each comparison, we want to see how many genes are detected as DE.
# getExpression('AT2G35980', conds = c('cNF', 'CNF'))
d <- data.frame(matrix(ncol = 4, nrow = 2))
colnames(d) <- names(g)
row.names(d) <- c("up", "down")
for (comp in names(g)) {
d["up", comp] <- sum(g[[comp]]$upreg == 1)
d["down", comp] <- sum(g[[comp]]$upreg == 0)
}
res <- melt(d)
res$reg = rep(c("up", "down"), 4)
genesco2At <- res
save(genesco2At, file = paste0("genesco2", specie, ".RData"))
ggplot(res, aes(fill = reg, y = value, x = variable)) + geom_bar(position = "stack",
stat = "identity", alpha = 0.5, color = "black") + ggtitle("Carbon dioxide effet on gene regulation") +
xlab("") + ylab("Number of differentially expressed genes") + coord_flip()On s’interroge sur l’effet de la double carence fer et nitrate en CO2 ambiant et élevé. On ne l’avait pas fait car cela fait varer deux facteurs à la fois.
(Intercept) groupcNF
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "cnf_2" "cnf_1" "cnf_3" "cNF_3" "cNF_2" "cNF_1"
cnf_2 cnf_1 cnf_3 cNF_3 cNF_2 cNF_1
1.0242524 1.0393358 1.0253221 0.9831632 0.9914864 0.9364401
[1] "9069 genes DE"
(Intercept) groupCNF
1 1 0
2 1 0
3 1 0
4 1 1
5 1 1
6 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
[1] "Cnf_3" "Cnf_1" "Cnf_2" "CNF_1" "CNF_3" "CNF_2"
Cnf_3 Cnf_1 Cnf_2 CNF_1 CNF_3 CNF_2
0.9823882 1.0318759 1.0281336 1.0008911 0.9447685 1.0119428
[1] "8309 genes DE"
doubleInt <- list(`ambiant CO2` = genes5$gene_id, `elevanted CO2` = genes6$gene_id)
ggVennDiagram(doubleInt) ensembl_gene_id
1 AT1G01720
2 AT1G01750
3 AT1G02575
4 AT1G02640
5 AT1G02850
6 AT1G03790
7 AT1G03870
8 AT1G04250
9 AT1G05650
10 AT1G05660
11 AT1G06120
12 AT1G07680
13 AT1G07690
14 AT1G07750
15 AT1G08080
16 AT1G08100
17 AT1G08290
18 AT1G08500
description
1 NAC domain-containing protein 2 [Source:UniProtKB/Swiss-Prot;Acc:Q39013]
2 actin depolymerizing factor 11 [Source:TAIR;Acc:AT1G01750]
3 unknown protein; BEST Arabidopsis thaliana protein match is: unknown protein (TAIR:AT1G02570.1). [Source:TAIR;Acc:AT1G02575]
4 Probable beta-D-xylosidase 2 [Source:UniProtKB/Swiss-Prot;Acc:Q94KD8]
5 Beta-glucosidase 11 [Source:UniProtKB/Swiss-Prot;Acc:B3H5Q1]
6 Zinc finger CCCH domain-containing protein 2 [Source:UniProtKB/Swiss-Prot;Acc:Q9ZWA1]
7 At1g03870 [Source:UniProtKB/TrEMBL;Acc:B3LF88]
8 Auxin-responsive protein IAA17 [Source:UniProtKB/Swiss-Prot;Acc:P93830]
9 F3F20.10 protein [Source:UniProtKB/TrEMBL;Acc:Q9SYK6]
10 F3F20.11 protein [Source:UniProtKB/TrEMBL;Acc:Q9SYK7]
11 Delta-9 desaturase-like 3 protein [Source:UniProtKB/Swiss-Prot;Acc:Q9FPD5]
12 F24B9.23 [Source:UniProtKB/TrEMBL;Acc:Q9LQP3]
13 FUNCTIONS IN: molecular_function unknown; INVOLVED IN: biological_process unknown; LOCATED IN: endomembrane system; EXPRESSED IN: root; CONTAINS InterPro DOMAIN/s: Orthoreovirus membrane fusion p10 (InterPro:IPR009854); BEST Arabidopsis thaliana pro /.../atch is: unknown protein (TAIR:AT1G54950.1); Ha. [Source:TAIR;Acc:AT1G07690]
14 At1g07750/F24B9_13 [Source:UniProtKB/TrEMBL;Acc:Q9LQQ3]
15 Alpha carbonic anhydrase 7 [Source:UniProtKB/Swiss-Prot;Acc:Q8L817]
16 NRT2 [Source:UniProtKB/TrEMBL;Acc:A0A178W7F7]
17 Zinc finger protein WIP3 [Source:UniProtKB/Swiss-Prot;Acc:Q9SGD1]
18 Early nodulin-like protein 18 [Source:UniProtKB/TrEMBL;Acc:O82083]
external_gene_name entrezgene_id
1 NAC002 839265
2 ADF11 839281
3 10723100
4 BXL2 837940
5 BGLU11 839435
6 SOM 839408
7 FLA9 839384
8 IAA17 839568
9 837072
10 837073
11 837121
12 837281
13 837282
14 837290
15 ACA7 837326
16 NRT2.2 837328
17 WIP3 837349
18 ENODL18 837371
[ reached 'max' / getOption("max.print") -- omitted 6632 rows ]
On trouve énormément de gènes, ce qui est cohérent car le nitrate et le fer pris séparément avaient déjà beaucoup d’effet.
On cherche à savoir si il n’y pas pas un soucis avec l’échantillon CNF, qui semble avoi très très peu de différences avec cNF. Aurait-on envoyé les mêmes tubes?
On ne voit que très peu de DEGs en comparant directement cNF et CNF, on cherche donc à savoir si on trouve le même nombre de DEGs quand on compare cNF - x et CNF -x, ce qui validerait l’hypothèse d’un transcriptôme quasi identique. On choisit ici par exemple \(x=\) cnF.
# res <- c() for(x in c('cnF', 'cnf', 'cNf', 'CnF', 'Cnf', 'CNf')){ labels <-
# c('cNF', x) genes7 <- dualDE(data, labels, pval = 0.01, plot = F) labels <-
# c('CNF', x) genes8 <- dualDE(data, labels, pval = 0.01, plot = F) double <-
# list('ambiant CO2' = genes7$gene_id, 'elevanted CO2' = genes8$gene_id)
# print(ggVennDiagram(double)) partitions <- get.venn.partitions(double) res <-
# c(res, (partitions[2, '..count..']+partitions[3,
# '..count..'])/sum(partitions[,'..count..'])*100) } print('Pourcentage des gènes
# DE qui ne sont pas partagés par les deux comparaisons :') print(res)Il semble qu’il y ait quand même pas mal de différences entre ces transcriptomes, car quand on les compare au même troisième transcriptome on a une différence de 1000 gènes (1900 contre 2800). Le diagramme de Venn montre que ces gènes sont en partie différents. Les transcriptômes ont bien l’air différents. Si on prend un autre x :
# labels <- c('cNF', 'cNf') genes7 <- dualDE(data, labels, pval = 0.01) labels <-
# c('CNF', 'cNf') genes8 <- dualDE(data, labels, pval = 0.01) double <-
# list('ambiant CO2' = genes7$gene_id, 'elevanted CO2' = genes8$gene_id)
# ggVennDiagram(double) OntologyProfile(intersect(genes7$gene_id,
# genes8$gene_id), specie = specie)On va faire les corrélations d’expression entre ces deux conditions.
scatter <- function(df, x, y) {
sp <- ggplot(df, aes(log(df[, x]), log(df[, y]))) + geom_bin2d(bins = 70) + scale_fill_gradient2() +
theme(legend.position = "none") + labs(x = x, y = y)
return(sp)
}
comps <- labels <- c("cNF", "CNF")
diffs <- c()
# correlations between ambient and elevated
for (ambient in colnames(data)[grepl("cNF", colnames(data))]) {
for (elevated in colnames(data)[grepl("CNF", colnames(data))]) {
diffs = c(diffs, cor(data[ambient], data[elevated]))
}
}
diffs_other <- c()
for (ambient in colnames(data)[grepl("cNF", colnames(data))]) {
for (elevated in colnames(data)[!grepl("NF", colnames(data))]) {
diffs_other = c(diffs_other, cor(data[ambient], data[elevated]))
}
}
# correlations within ambien and elevated
sames <- c()
samples <- colnames(data)[grepl("cNF", colnames(data))]
sames <- c(sames, cor(data[, samples[1]], data[, samples[2]]), cor(data[, samples[1]],
data[, samples[3]]), cor(data[, samples[2]], data[, samples[3]]))
samples <- colnames(data)[grepl("CNF", colnames(data))]
sames <- c(sames, cor(data[, samples[1]], data[, samples[2]]), cor(data[, samples[1]],
data[, samples[3]]), cor(data[, samples[2]], data[, samples[3]]))
sames[1] 0.9938199 0.9906937 0.9810858 0.9860430 0.9921428 0.9860068
df <- data.frame(`Same condition` = c(sames, rep(NA, 3)), `Ambient vs Elevated` = diffs)
res <- na.omit(melt(df))
# comparisons
ggplot(data = res, aes(x = variable, y = value, fill = variable)) + geom_boxplot(binaxis = "y",
alpha = 0.4, stackdir = "center") + theme(axis.text.x = element_text(angle = 320,
hjust = 0, colour = "grey50"), plot.title = element_text(size = 14, face = "bold")) +
ggtitle("Correlation for expression accross conditions") + geom_dotplot(binaxis = "y",
stackdir = "center") + stat_compare_means(method = "wilcox.test", hide.ns = FALSE,
label = "p.signif", symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05,
1), symbols = c("****", "***", "**", "*", "ns")))res <- rbind.data.frame(res, data.frame(variable = rep("cNF vs all others except CNF",
length(diffs_other)), value = diffs_other))
ggplot(data = res, aes(x = variable, y = value, fill = variable)) + geom_boxplot(binaxis = "y",
alpha = 0.4, stackdir = "center") + theme(axis.text.x = element_text(angle = 320,
hjust = 0, colour = "grey50"), plot.title = element_text(size = 14, face = "bold")) +
ggtitle("Correlation for expression accross conditions") + geom_dotplot(binaxis = "y",
stackdir = "center") + stat_compare_means(method = "wilcox.test", hide.ns = FALSE,
label = "p.signif", symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05,
1), symbols = c("****", "***", "**", "*", "ns"))) cNF et CNF semblent bien plus proches entre eux que cNF ne l’est avec tous les autres.